#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.8 - 21-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
#https://docs.python.org/2/library/configparser.html
import os
import nltk # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
import codecs
import re
from transliterate import translit, get_available_language_codes
import onetipp
import sys

# Make the project's local modules importable and point the egg cache at a
# writable directory.
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

# Python 2 only: site.py deletes sys.setdefaultencoding at startup, so the
# module must be reloaded to restore it before it can be called.  Without
# this reload the original line raised AttributeError on every run.
reload(sys)
sys.setdefaultencoding('utf-8')
# Synonyms already used once in this document; a synonym is never reused.
noDoubleHash = set()

# Matches a sentence-ending character (?, . or !) at the END of a token;
# applied to the previous token to detect that the current word starts a
# new sentence.
re_match = r"(\?|\.|\!)$"

# Input and output file paths from the command line:
#   argv[1] = input text file, argv[2] = output file.
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole input file; close the handle deterministically (the
# original leaked the file object) and decode to unicode.
with open(inputfile, 'r') as _fh:
    text = _fh.read()
text = text.decode("utf-8")

# Summarize the text first and then work on the summary; keep a second
# token list over the raw (unsummarized) text as well.
tSumy = onetipp.summarizeText(text)
tokens = nltk.word_tokenize(tSumy)
tokensRaw = nltk.word_tokenize(text)

count = -1                # index of the current token (incremented first thing in the loop)
changeEveryWord = 8       # Leistungsschutzrecht: up to 7 words may be kept verbatim,
                          # after that one must be changed
changeEveryWordFlag = 0   # 1 while inside the "recently replaced" window
changeEveryWordTemp = 0   # words seen since the last replacement
def _best_unused_synonym(candidates, word):
    """Rank *candidates* for *word* via onetipp.SynRanker and return the
    best-ranked one that has not been used before, or None when every
    candidate is already in noDoubleHash (the original crashed with
    IndexError in that case)."""
    ranked = {}
    for cand in candidates:
        if cand not in noDoubleHash:
            ranked[cand] = onetipp.SynRanker(cand, word)
    if not ranked:
        return None
    best = sorted(ranked.items(), key=lambda x: x[1], reverse=True)
    return str(best[0][0])


def _carry_over_punctuation(syn, word):
    """Copy a single trailing punctuation mark from *word* onto *syn*
    (the replaced token must keep the sentence punctuation)."""
    for mark in ('.', '?', '!', ',', ';', ':'):
        if word.endswith(mark):
            return syn + mark
    return syn


def _apply_synonym(syn, word, idx):
    """Write synonym *syn* (replacing *word*) into both token lists at
    *idx* and remember it in noDoubleHash."""
    # If the previous token ends a sentence, the replacement starts a new
    # sentence and is written in title case.  BUG FIX: the original called
    # .title() and discarded the result, so capitalization never happened.
    if re.search(re_match, tokens[idx - 1]) is not None:
        syn = syn.title()
    syn = _carry_over_punctuation(syn, word)
    tokens[idx] = onetipp.deumlaut(syn)
    tokensRaw[idx] = onetipp.deumlaut(syn)
    noDoubleHash.add(syn)


for word in tokens:
    count += 1
    wordTemp = word.encode('ascii', 'ignore')

    # A recognized proper name must never be replaced by a synonym — only
    # de-umlauted.  Parameterized query instead of %-interpolation: the
    # word comes from untrusted input text (SQL injection).
    onetipp.cursorMysql.execute(
        "SELECT * FROM (namen_table) WHERE BINARY `name` = %s LIMIT 1;",
        (wordTemp,))
    name_content = onetipp.cursorMysql.fetchone()
    if name_content is not None:
        tokens[count] = onetipp.deumlaut(word)
        tokensRaw[count] = onetipp.deumlaut(word)
        continue

    # Sliding window: after (changeEveryWord - 1) untouched words the next
    # word becomes eligible for replacement again.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    # Only words of 4+ characters outside the skip window are replaced.
    if len(word) >= 4 and changeEveryWordFlag == 0:
        # First choice: the Leipzig synonym database.
        sLeipzigList = onetipp.getSynLeipzig(word)
        if sLeipzigList:
            firstBestSynHit = _best_unused_synonym(sLeipzigList, word)
            if firstBestSynHit is not None:
                _apply_synonym(firstBestSynHit, word, count)
                changeEveryWordFlag = 1
                changeEveryWordTemp += 1
        else:
            # Fallback: local synonym MySQL database, located via Sphinx.
            search_query_syn = Search(indexes=['onetipp_syn_simple'],
                                      config=onetipp.SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            try:
                # Py2: dict.values() is a list, [0] is the first value.
                # IndexError here means Sphinx found nothing — handled below.
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                if synID > 0:
                    onetipp.cursorMysql.execute(
                        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid = %s",
                        (synID,))
                    syn_content = onetipp.cursorMysql.fetchone()
                    # BUG FIX: check the row BEFORE decoding it — the
                    # original ran list(syn_content) first and raised an
                    # uncaught TypeError when the row was missing.
                    if syn_content:
                        synContent = list(syn_content)[0].decode(
                            encoding="utf-8", errors="ignore")
                        synwords = synContent.split(";")
                        firstBestSynHit = _best_unused_synonym(synwords, word)
                        if firstBestSynHit is not None:
                            _apply_synonym(firstBestSynHit, word, count)
                            changeEveryWordFlag = 1
                            changeEveryWordTemp += 1
            except IndexError:
                # Sphinx returned no items for this word — leave it as-is.
                pass
# Join the (possibly replaced) tokens back into text and write the result.
# NOTE: the original computed textstat.flesch_reading_ease() here, but
# `textstat` is never imported (NameError at runtime) and the value was only
# used by commented-out code — removed.
outputtext = ' '.join(tokens)
outputtextRaw = ' '.join(tokensRaw)

# Explicit encoding: codecs.open() without one returns a plain file object
# in Python 2, so the unicode write relied on the default-encoding hack.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)

onetipp.mysql.commit()
onetipp.mysql.close()
exit(0)
"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score. The following table helps to assess the readability of a document.
90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing
"""